# coding:utf-8

import urllib2, re

from bs4 import BeautifulSoup

# Fetch the article page; close the HTTP response even if read() fails,
# so the underlying socket is not leaked.
response = urllib2.urlopen("http://star.news.sohu.com/20160611/n453846195.shtml")
try:
    html = response.read()
finally:
    response.close()

# Build the soup object (document content, parser, page encoding).
soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')

links = soup.find_all('a', href=re.compile(r'http://(star\.)?news.sohu.com/[\d]+/n[\d]+.shtml'))
for link in links:
    print link.name
    print link['href']
    print link.get_text()
    print '======================================'

print '本文内容'
title = soup.find('h1')
print '标题', title.get_text()
content = soup.find('div', id='contentText')
print '正文'
texts = content.find_all('p')
text = ''
for p in texts:
    text += str(p)
print text